<?php

/**
 * 数据抓取
 * @author 暮雨秋晨
 * @copyright 2014
 */

/**
 * Static helper for downloading remote documents over HTTP(S).
 *
 * Every public method validates the URL first, downloads the body with the
 * selected transport and returns the body converted to UTF-8. Failures are
 * reported by returning false; invalid arguments raise an exception.
 */
class Grab
{
    /** Fetch mode: download through the cURL extension. */
    const FETCH_CURL = 1;
    /** Fetch mode: download through file_get_contents(). */
    const FETCH_GET = 2;

    /**
     * Fetch a single URL.
     *
     * @param string $url       Absolute http:// or https:// URL.
     * @param int    $FetchMode self::FETCH_CURL or self::FETCH_GET; any other
     *                          value falls back to self::FETCH_GET.
     * @return string|false The body converted to UTF-8, or false when the URL
     *                      fails validation, the download fails, or the body
     *                      is empty.
     * @throws InvalidArgumentException When $url is empty or is an array.
     */
    public static function fetch($url, $FetchMode = self::FETCH_GET)
    {
        if (empty($url)) {
            throw new InvalidArgumentException('Grab::fetch() can not use empty attribute', 0);
        }
        if (is_array($url)) {
            throw new InvalidArgumentException(
                'Grab::fetch() Cannot handle multiple data,please use Grab::fetchAll()',
                0
            );
        }
        if (!self::__URL_VERIFY($url)) {
            return false;
        }

        switch ($FetchMode) {
            case self::FETCH_CURL:
                $data = self::__FETCH_CURL($url);
                break;
            case self::FETCH_GET:
            default:
                $data = self::__FETCH_GET($url);
        }

        // Compare explicitly instead of relying on truthiness: a body that
        // consists of the single character "0" is valid data, not a failure.
        if ($data === false || $data === null || $data === '') {
            return false;
        }

        return self::__ENCODE_2_UTF8($data);
    }

    /**
     * Fetch several URLs; nested arrays are processed recursively.
     *
     * @param array $urls      List of URLs (or nested lists of URLs).
     * @param int   $FetchMode See fetch().
     * @return array|false Results in input order (input keys are not kept);
     *                     each entry is what fetch() returned for that URL, or
     *                     the result of a recursive call for a nested array.
     *                     Returns false when $urls is empty.
     * @throws InvalidArgumentException When any URL is empty (raised by fetch()).
     */
    public static function fetchAll(array $urls, $FetchMode = self::FETCH_GET)
    {
        $data = array();
        foreach ($urls as $url) {
            if (is_array($url)) {
                $data[] = self::fetchAll($url, $FetchMode);
            } else {
                $data[] = self::fetch($url, $FetchMode);
            }
        }

        return empty($data) ? false : $data;
    }

    /**
     * Download a URL with the cURL extension.
     *
     * @param string $url        URL to download.
     * @param bool   $autoCookie Currently unused; kept so the signature stays
     *                           compatible.
     * @return string|false Response body, or false on failure.
     */
    private static function __FETCH_CURL($url, $autoCookie = false)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        // Only the connection phase is limited (60 s); the transfer itself
        // has no time limit.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Do Crawler');
        $res = curl_exec($ch);
        curl_close($ch);

        return $res;
    }

    /**
     * Download a URL with file_get_contents().
     *
     * @param string $url URL to download.
     * @return string|false Response body, or false when the request fails or
     *                      the body is empty.
     */
    private static function __FETCH_GET($url)
    {
        $res = file_get_contents($url);

        // Strict checks so that a body of "0" is not mistaken for a failure.
        if ($res === false || $res === '') {
            return false;
        }

        return $res;
    }

    /**
     * Convert downloaded data to UTF-8.
     *
     * Data that is already valid UTF-8 (plain ASCII included) is returned
     * unchanged. Anything else is treated as GB2312 and converted with
     * iconv(), dropping byte sequences that cannot be converted.
     *
     * @param string $string Raw bytes.
     * @return string|false UTF-8 text, or false if iconv() fails.
     */
    private static function __ENCODE_2_UTF8($string)
    {
        // With the "u" modifier PCRE validates the whole subject as UTF-8 and
        // the match fails on any invalid sequence, so an empty pattern works
        // as a strict validity check without needing the mbstring extension.
        if (preg_match('//u', $string) === 1) {
            return $string;
        }

        return iconv('GB2312', 'UTF-8//IGNORE', $string);
    }

    /**
     * Check that a string looks like an absolute http(s) URL.
     *
     * Accepts a dotted IPv4 address or a domain name, an optional port of up
     * to five digits, and an optional path/query part.
     *
     * @param string $url Candidate URL.
     * @return bool True when the URL matches the accepted shape.
     */
    private static function __URL_VERIFY($url)
    {
        return preg_match('/^http[s]?:\/\/' . '(([0-9]{1,3}\.){3}[0-9]{1,3}' .
            // URL in IP form, e.g. 199.194.52.184
            '|' . // allow either an IP address or a domain name
            '([0-9a-z_!~*\'()-]+\.)*' . // sub-domain labels, e.g. "www."
            '([0-9a-z][0-9a-z-]{0,61})?[0-9a-z]\.' . // second-level domain
            '[a-z]{2,6})' . // first level domain- .com or .museum
            '(:[0-9]{1,5})?' . // port, e.g. :80 (ports go up to 65535)
            '((\/\?)|' . // a slash isn't required if there is no file name
            '(\/[0-9a-zA-Z_!~\'\(\)\[\]\.;\?:@&=\+\$,%#-\/^\*\|]*)?)$/Uis', $url) == 1;
    }
}

?>